library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the loan-default data, lower-case every column name, and preview
# the structure of the resulting data frame.
df <- read.csv("/Users/ankitkumar/Desktop/Loan_Default.csv")
colnames(df) <- tolower(colnames(df))
glimpse(df)
## Rows: 148,670
## Columns: 34
## $ id                        <int> 24890, 24891, 24892, 24893, 24894, 24895, 24…
## $ year                      <int> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 20…
## $ loan_limit                <chr> "cf", "cf", "cf", "cf", "cf", "cf", "cf", ""…
## $ gender                    <chr> "Sex Not Available", "Male", "Male", "Male",…
## $ approv_in_adv             <chr> "nopre", "nopre", "pre", "nopre", "pre", "pr…
## $ loan_type                 <chr> "type1", "type2", "type1", "type1", "type1",…
## $ loan_purpose              <chr> "p1", "p1", "p1", "p4", "p1", "p1", "p3", "p…
## $ credit_worthiness         <chr> "l1", "l1", "l1", "l1", "l1", "l1", "l1", "l…
## $ open_credit               <chr> "nopc", "nopc", "nopc", "nopc", "nopc", "nop…
## $ business_or_commercial    <chr> "nob/c", "b/c", "nob/c", "nob/c", "nob/c", "…
## $ loan_amount               <int> 116500, 206500, 406500, 456500, 696500, 7065…
## $ rate_of_interest          <dbl> NA, NA, 4.560, 4.250, 4.000, 3.990, 4.500, 4…
## $ interest_rate_spread      <dbl> NA, NA, 0.2000, 0.6810, 0.3042, 0.1523, 0.99…
## $ upfront_charges           <dbl> NA, NA, 595.00, NA, 0.00, 370.00, 5120.00, 5…
## $ term                      <dbl> 360, 360, 360, 360, 360, 360, 360, 360, 360,…
## $ neg_ammortization         <chr> "not_neg", "not_neg", "neg_amm", "not_neg", …
## $ interest_only             <chr> "not_int", "not_int", "not_int", "not_int", …
## $ lump_sum_payment          <chr> "not_lpsm", "lpsm", "not_lpsm", "not_lpsm", …
## $ property_value            <dbl> 118000, NA, 508000, 658000, 758000, 1008000,…
## $ construction_type         <chr> "sb", "sb", "sb", "sb", "sb", "sb", "sb", "s…
## $ occupancy_type            <chr> "pr", "pr", "pr", "pr", "pr", "pr", "pr", "p…
## $ secured_by                <chr> "home", "home", "home", "home", "home", "hom…
## $ total_units               <chr> "1U", "1U", "1U", "1U", "1U", "1U", "1U", "1…
## $ income                    <dbl> 1740, 4980, 9480, 11880, 10440, 10080, 5040,…
## $ credit_type               <chr> "EXP", "EQUI", "EXP", "EXP", "CRIF", "EXP", …
## $ credit_score              <int> 758, 552, 834, 587, 602, 864, 860, 863, 580,…
## $ co.applicant_credit_type  <chr> "CIB", "EXP", "CIB", "CIB", "EXP", "EXP", "E…
## $ age                       <chr> "25-34", "55-64", "35-44", "45-54", "25-34",…
## $ submission_of_application <chr> "to_inst", "to_inst", "to_inst", "not_inst",…
## $ ltv                       <dbl> 98.72881, NA, 80.01969, 69.37690, 91.88654, …
## $ region                    <chr> "south", "North", "south", "North", "North",…
## $ security_type             <chr> "direct", "direct", "direct", "direct", "dir…
## $ status                    <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ dtir1                     <dbl> 45, NA, 46, 42, 39, 40, 44, 42, 44, 30, 44, …
# List the (lower-cased) column names together with their positions.
names(df)
##  [1] "id"                        "year"                     
##  [3] "loan_limit"                "gender"                   
##  [5] "approv_in_adv"             "loan_type"                
##  [7] "loan_purpose"              "credit_worthiness"        
##  [9] "open_credit"               "business_or_commercial"   
## [11] "loan_amount"               "rate_of_interest"         
## [13] "interest_rate_spread"      "upfront_charges"          
## [15] "term"                      "neg_ammortization"        
## [17] "interest_only"             "lump_sum_payment"         
## [19] "property_value"            "construction_type"        
## [21] "occupancy_type"            "secured_by"               
## [23] "total_units"               "income"                   
## [25] "credit_type"               "credit_score"             
## [27] "co.applicant_credit_type"  "age"                      
## [29] "submission_of_application" "ltv"                      
## [31] "region"                    "security_type"            
## [33] "status"                    "dtir1"
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Empty strings in the character columns stand for missing values:
# recode them to NA, then draw the missingness map.
df <- df %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
missmap(df)

# Positions of the character and numeric columns of `df`.
# Derived from the data itself instead of looping over a hard-coded
# column count, so the result stays correct if columns are added or removed.
# `unname()` keeps the results as plain integer vectors.
char_col <- unname(which(vapply(df, is.character, logical(1))))
num_col <- unname(which(vapply(df, is.numeric, logical(1))))
length(num_col)
## [1] 13
length(char_col)
## [1] 21
sort(char_col)
##  [1]  3  4  5  6  7  8  9 10 16 17 18 20 21 22 23 25 27 28 29 31 32
sort(num_col)
##  [1]  1  2 11 12 13 14 15 19 24 26 30 33 34
# Names of the 21 character columns, in column order.
# Fixes the earlier list, which contained the numeric "property_value",
# repeated "region", and left out "lump_sum_payment" and "credit_type".
charcol <- c(
  "loan_limit", "gender", "approv_in_adv", "loan_type", "loan_purpose",
  "credit_worthiness", "open_credit", "business_or_commercial",
  "neg_ammortization", "interest_only", "lump_sum_payment",
  "construction_type", "occupancy_type", "secured_by", "total_units",
  "credit_type", "co.applicant_credit_type", "age",
  "submission_of_application", "region", "security_type"
)
length(charcol)
## [1] 21
# Names of the 13 numeric columns, in column order.
# "property_value" is numeric; "lump_sum_payment" is a character column and
# therefore belongs to `charcol`.
numcol <- c(
  "id", "year", "loan_amount", "rate_of_interest", "interest_rate_spread",
  "upfront_charges", "term", "property_value", "income", "credit_score",
  "ltv", "status", "dtir1"
)
length(numcol)
## [1] 13
# Print the distinct values of each character column, one column at a time.
invisible(lapply(char_col, function(col_idx) {
  print(unique(df[, col_idx]))
}))
## [1] "cf"  NA    "ncf"
## [1] "Sex Not Available" "Male"              "Joint"            
## [4] "Female"           
## [1] "nopre" "pre"   NA     
## [1] "type1" "type2" "type3"
## [1] "p1" "p4" "p3" "p2" NA  
## [1] "l1" "l2"
## [1] "nopc" "opc" 
## [1] "nob/c" "b/c"  
## [1] "not_neg" "neg_amm" NA       
## [1] "not_int"  "int_only"
## [1] "not_lpsm" "lpsm"    
## [1] "sb" "mh"
## [1] "pr" "sr" "ir"
## [1] "home" "land"
## [1] "1U" "2U" "3U" "4U"
## [1] "EXP"  "EQUI" "CRIF" "CIB" 
## [1] "CIB" "EXP"
## [1] "25-34" "55-64" "35-44" "45-54" "65-74" ">74"   "<25"   NA     
## [1] "to_inst"  "not_inst" NA        
## [1] "south"      "North"      "central"    "North-East"
## [1] "direct"   "Indriect"
library(patchwork)

# Convert every character column to a factor.
df <- df %>%
  mutate(across(where(is.character), as.factor))
# Bar chart of the level counts of one column of `data`.
#
# data:      data frame holding the column.
# col_index: position of the column to plot.
# Returns a ggplot object. The column is referenced by name through the
# `.data` pronoun so the x axis is labelled with the column name rather
# than an expression such as "df[, 3]".
plot_level_counts <- function(data, col_index) {
  col_name <- names(data)[col_index]
  ggplot(data, aes(x = .data[[col_name]])) +
    geom_bar(fill = "steelblue") +
    labs(x = col_name) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# One chart per categorical column; the suffix is the column position.
gg3 <- plot_level_counts(df, 3)
gg4 <- plot_level_counts(df, 4)
gg5 <- plot_level_counts(df, 5)
gg6 <- plot_level_counts(df, 6)
gg7 <- plot_level_counts(df, 7)
gg8 <- plot_level_counts(df, 8)
gg9 <- plot_level_counts(df, 9)
gg10 <- plot_level_counts(df, 10)
gg16 <- plot_level_counts(df, 16)
gg17 <- plot_level_counts(df, 17)
gg18 <- plot_level_counts(df, 18)
gg20 <- plot_level_counts(df, 20)
gg21 <- plot_level_counts(df, 21)
gg22 <- plot_level_counts(df, 22)
gg23 <- plot_level_counts(df, 23)
gg25 <- plot_level_counts(df, 25)
gg27 <- plot_level_counts(df, 27)
gg28 <- plot_level_counts(df, 28)
gg29 <- plot_level_counts(df, 29)
gg31 <- plot_level_counts(df, 31)
gg32 <- plot_level_counts(df, 32)

# Arrange the charts into three patchwork grids.
(gg3 | gg4 | gg5 | gg6) / (gg7 | gg8 | gg9 | gg10)

(gg16 | gg17 | gg18 | gg20) / (gg21 | gg22 | gg23 | gg25)

(gg27 | gg28 | gg29) / (gg31 | gg32)

# Number of missing values per column, keeping only columns that have any,
# ordered from most to fewest missing (ties broken alphabetically by name).
# Counting NAs column-wise avoids stacking factor and numeric columns into
# one vector, so no attribute-dropping or grouping warnings are raised, and
# the result is an ungrouped tibble with columns `key` and `num.missing`.
missing.values <- df %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "num.missing") %>%
  filter(num.missing > 0) %>%
  arrange(desc(num.missing), key)
missing.values 
## # A tibble: 14 × 2
## # Groups:   key [14]
##    key                       num.missing
##    <chr>                           <int>
##  1 upfront_charges                 39642
##  2 interest_rate_spread            36639
##  3 rate_of_interest                36439
##  4 dtir1                           24121
##  5 ltv                             15098
##  6 property_value                  15098
##  7 income                           9150
##  8 loan_limit                       3344
##  9 approv_in_adv                     908
## 10 age                               200
## 11 submission_of_application         200
## 12 loan_purpose                      134
## 13 neg_ammortization                 121
## 14 term                               41
#
# Column positions in `df` of the variables that contain missing values,
# in the same order as the rows of `missing.values`.
# `match()` replaces the nested loops over hard-coded row/column counts.
z <- match(missing.values$key, colnames(df))

z %>% sort()
##  [1]  3  5  7 12 13 14 15 16 19 24 28 29 30 34
# Work on a copy so the un-imputed data stays available in `df`.
df1 <- df
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# Numeric columns to impute (by position in `df1`) together with the marker
# sizes used in each column's diagnostic plot. Columns 15 and 30 use
# different sizes from the rest.
imputation_specs <- data.frame(
  col              = c(12L, 13L, 14L, 15L, 19L, 24L, 30L, 34L),
  size_points      = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 2.5, 1.5),
  size_imputations = c(0.5, 0.5, 0.5, 2.5, 0.5, 0.5, 0.5, 0.5),
  size_truth       = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 2.5, 1.5)
)

# For each column: show its first values, fill the gaps with na_kalman(),
# write the result back into `df1`, and plot known vs. imputed values.
# The former `mutate(.data = df1, options(max.print = 1))` calls are gone:
# they only changed a global print option as a side effect and printed a
# data frame with a junk column.
# In the recorded run na_kalman() raised a StructTS "possible convergence
# problem" warning for columns 12, 13, 14, 15, 19 and 24.
# NOTE(review): na_kalman() is a time-series method that imputes along row
# order; presumably the rows here are not a time series -- confirm that this
# imputation is intended.
for (k in seq_len(nrow(imputation_specs))) {
  spec <- imputation_specs[k, ]

  x_with_na <- df1[[spec$col]]
  print(head(x_with_na))

  x_with_imputations <- na_kalman(x_with_na)
  df1[[spec$col]] <- x_with_imputations

  # Inside a loop a ggplot object must be printed explicitly to be drawn.
  print(ggplot_na_imputations(
    x_with_na,
    x_with_imputations,
    x_with_truth = NULL,
    x_axis_labels = NULL,
    title = "Imputed Values",
    subtitle = "Visualization of missing value replacements",
    xlab = "Time",
    ylab = "Value",
    color_points = "steelblue",
    color_imputations = "indianred",
    color_truth = "seagreen3",
    color_lines = "lightslategray",
    shape_points = 16,
    shape_imputations = 9,
    shape_truth = 16,
    size_points = spec$size_points,
    size_imputations = spec$size_imputations,
    size_truth = spec$size_truth,
    size_lines = 0.5,
    linetype = "solid",
    connect_na = TRUE,
    legend = TRUE,
    legend_size = 5,
    label_known = "known values",
    label_imputations = "imputed values",
    label_truth = "ground truth",
    theme = ggplot2::theme_linedraw()
  ))
}

# Leave the print limit at the value the rest of the script was run with.
options(max.print = 1000)
# The outcome is categorical (0/1), so store it as a factor.
df1$status <- as.factor(df1$status)
class(df1$status)
## [1] "factor"
# Keep only the rows with no remaining missing values. The earlier
# `df = df1` assignment was dead code: it was overwritten immediately.
df <- na.omit(df1)
summary(df)
##        id              year      loan_limit                 gender     
##  Min.   : 24890   Min.   :2019   cf :134224   Female           :26405  
##  1st Qu.: 62036   1st Qu.:2019   ncf:  9759   Joint            :40200  
##  Median : 99169   Median :2019                Male             :40995  
##  Mean   : 99210   Mean   :2019                Sex Not Available:36383  
##  3rd Qu.:136384   3rd Qu.:2019                                         
##  Max.   :173559   Max.   :2019                                         
##                                                                        
##  approv_in_adv  loan_type      loan_purpose credit_worthiness open_credit  
##  nopre:121038   type1:109694   p1:33184     l1:137671         nopc:143444  
##  pre  : 22945   type2: 19915   p2: 3150     l2:  6312         opc :   539  
##                 type3: 14374   p3:54102                                    
##                                p4:53547                                    
##                                                                            
##                                                                            
##                                                                            
##  business_or_commercial  loan_amount      rate_of_interest interest_rate_spread
##  b/c  : 19915           Min.   :  16500   Min.   :0.000    Min.   :-3.6380     
##  nob/c:124068           1st Qu.: 196500   1st Qu.:3.750    1st Qu.: 0.1781     
##                         Median : 296500   Median :4.034    Median : 0.4374     
##                         Mean   : 331772   Mean   :4.041    Mean   : 0.4390     
##                         3rd Qu.: 436500   3rd Qu.:4.250    3rd Qu.: 0.6171     
##                         Max.   :3576500   Max.   :8.000    Max.   : 3.3570     
##                                                                                
##  upfront_charges      term       neg_ammortization  interest_only   
##  Min.   :    0   Min.   : 96.0   neg_amm: 14386    int_only:  6826  
##  1st Qu.: 1250   1st Qu.:360.0   not_neg:129597    not_int :137157  
##  Median : 3163   Median :360.0                                      
##  Mean   : 3228   Mean   :335.1                                      
##  3rd Qu.: 3901   3rd Qu.:360.0                                      
##  Max.   :60000   Max.   :360.0                                      
##                                                                     
##  lump_sum_payment  property_value     construction_type occupancy_type
##  lpsm    :  3384   Min.   :    8000   mh:    33         ir:  7053     
##  not_lpsm:140599   1st Qu.:  288000   sb:143950         pr:133903     
##                    Median :  458000                     sr:  3027     
##                    Mean   :  499248                                   
##                    3rd Qu.:  598000                                   
##                    Max.   :16508000                                   
##                                                                       
##  secured_by    total_units     income       credit_type   credit_score  
##  home:143950   1U:141877   Min.   :     0   CIB :46738   Min.   :500.0  
##  land:    33   2U:  1432   1st Qu.:  3840   CRIF:42560   1st Qu.:599.0  
##                3U:   371   Median :  6000   EQUI:14609   Median :699.0  
##                4U:   303   Mean   :  6962   EXP :40076   Mean   :699.7  
##                            3rd Qu.:  8280                3rd Qu.:800.0  
##                            Max.   :578580                Max.   :900.0  
##                                                                         
##  co.applicant_credit_type    age        submission_of_application
##  CIB:72056                <25  : 1295   not_inst:51069           
##  EXP:71927                >74  : 6987   to_inst :92914           
##                           25-34:18494                            
##                           35-44:31816                            
##                           45-54:33688                            
##                           55-64:31550                            
##                           65-74:20153                            
##       ltv                  region       security_type    status    
##  Min.   :   0.967   central   : 8408   direct  :143950   0:108713  
##  1st Qu.:  63.142   North     :72427   Indriect:    33   1: 35270  
##  Median :  73.534   North-East: 1207                               
##  Mean   :  72.701   south     :61941                               
##  3rd Qu.:  84.574                                                  
##  Max.   :7831.250                                                  
##                                                                    
##      dtir1      
##  Min.   : 5.00  
##  1st Qu.:33.00  
##  Median :37.94  
##  Mean   :37.72  
##  3rd Qu.:44.00  
##  Max.   :61.00  
## 

# Visual EDA ----

# Correlation matrix ----

library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of the numeric columns.
# Constant columns (here `year`, which is 2019 for every row) have zero
# standard deviation and would produce NA correlations plus a warning, so
# they are dropped before calling cor().
loan_cor <- df %>%
  select(where(is.numeric)) %>%
  drop_na() %>%
  select(where(~ n_distinct(.x) > 1)) %>%
  cor()
corrplot(loan_cor, method = "circle", addCoef.col = 1, number.cex = 0.7)

# Correlations among the main amount, value, income and pricing variables.
loan_cor <- cor(
  drop_na(
    select(
      df,
      loan_amount, property_value, income,
      interest_rate_spread, rate_of_interest, ltv
    )
  )
)

corrplot(loan_cor, method = "circle", addCoef.col = 1, number.cex = 0.7)

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plots of key variables, coloured by loan status.
# `status` is carried along in the piped data and used only for colouring
# (`columns = 1:6` restricts the plotted panels), instead of reaching back
# into the global `df` from inside aes().
df %>%
  select(
    loan_amount, income, age, property_value,
    rate_of_interest, interest_rate_spread, status
  ) %>%
  ggpairs(columns = 1:6, mapping = aes(color = status, alpha = 0.5)) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library("gridExtra")
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Jittered scatter plot of two columns of `data`.
#
# data:  data frame holding both columns.
# x_var: name (string) of the column drawn on the x axis.
# y_var: name (string) of the column drawn on the y axis.
# Returns a ggplot object whose axis labels are the column names, so the
# labels can never disagree with what is plotted.
scatter_pair <- function(data, x_var, y_var) {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point(alpha = 0.3, position = "jitter", color = "darkblue") +
    labs(x = x_var, y = y_var) +
    theme_bw() +
    theme(plot.background = element_rect(fill = "white"))
}

# Previously all three grids plotted the same variables (income vs.
# loan_amount and age vs. credit_score) while only the axis labels changed.
# The mappings below follow the labels that were written on each panel.
# NOTE(review): the intended variable pairs are inferred from those labels
# -- confirm them.

# Rows with income below 500000, used for the first panel of each grid.
# NOTE(review): in grids 2 and 3 income is not on either axis of that
# panel; the filter is kept from the original code -- confirm it is wanted.
df_income_capped <- df %>% filter(income < 500000)

# Grid 1: loan amount and income
p1 <- scatter_pair(df_income_capped, "income", "loan_amount")
p2 <- scatter_pair(df, "age", "loan_amount")
p4 <- scatter_pair(df, "income", "loan_amount")
p3 <- scatter_pair(df, "age", "income")

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 2: age against property value and pricing, property value vs. loan
p1 <- scatter_pair(df_income_capped, "age", "property_value")
p2 <- scatter_pair(df, "age", "rate_of_interest")
p4 <- scatter_pair(df, "age", "interest_rate_spread")
p3 <- scatter_pair(df, "property_value", "loan_amount")

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Grid 3: property value against age and income, interest rate vs. age
p1 <- scatter_pair(df_income_capped, "age", "property_value")
p2 <- scatter_pair(df, "income", "property_value")
p4 <- scatter_pair(df, "age", "property_value")
p3 <- scatter_pair(df, "age", "rate_of_interest")

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Loan amount vs. annual income
p1 = df %>% filter(income < 500000) %>% 
ggplot(aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  theme_bw() +
  labs(x = "age", y = "interest_rate_spread") +
  theme(plot.background = element_rect(fill = "white"))

# loan to income ratio vs. annual income
p2 <- ggplot(df, aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "loan_amount", y = "property_value") +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

# Loan amount vs. loan to income ratio
p4 <- ggplot(df, aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "age", y = 'property_value') +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

# credit history vs. age
p3 <- ggplot(df, aes(x = age, y = credit_score)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "loan_amount", y = "rate_of_interest") +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Loan amount vs. annual income
p1 = df %>% filter(income < 500000) %>% 
ggplot(aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  theme_bw() +
  labs(x = "income", y = "rate_of_interest") +
  theme(plot.background = element_rect(fill = "white"))

# loan to income ratio vs. annual income
p2 <- ggplot(df, aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "age", y = "rate_of_interest") +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

# Loan amount vs. loan to income ratio
p4 <- ggplot(df, aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "property_value", y = 'rate_of_interest') +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

# credit history vs. age
p3 <- ggplot(df, aes(x = age, y = credit_score)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "loan_amount", y = "interest_rate_spread") +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

# Loan amount vs. annual income
p1 = df %>% filter(income < 500000) %>% 
ggplot(aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  theme_bw() +
  labs(x = "income", y = "interest_rate_spread") +
  theme(plot.background = element_rect(fill = "white"))

# loan to income ratio vs. annual income
p2 <- ggplot(df, aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "age", y = "interest_rate_spread") +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

# Loan amount vs. loan to income ratio
p4 <- ggplot(df, aes(x = income, y = loan_amount)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "property_value", y = 'interest_rate_spread') +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

# credit history vs. age
p3 <- ggplot(df, aes(x = age, y = credit_score)) +
  geom_point(alpha=0.3, position = "jitter", color = "darkblue") +
  labs(x = "rate_of_interest", y = "interest_rate_spread") +
  theme_bw() +
  theme(plot.background = element_rect(fill = "white"))

grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

library(DataExplorer)
# Bar plots of df, broken down by the `status` column and laid out
# four rows by one column per page.
plot_bar(
  data = df,
  by = "status",
  nrow = 4,
  ncol = 1,
  ggtheme = theme_bw(),
  parallel = TRUE
)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive scatter of interest rate against loan amount, coloured by
# status, with one animation frame per age value. Hovering shows the status.
# Columns are referenced as `~column` so they are looked up in the data frame
# piped into plot_ly() rather than in the global `df` object.
df %>%
  plot_ly(
    x = ~loan_amount,
    y = ~rate_of_interest,
    color = ~status,
    colors = "Set2",
    frame = ~age,
    text = ~status,
    hoverinfo = "text",
    type = "scatter",
    mode = "markers"
  ) %>%
  plotly::layout(
    xaxis = list(title = "Loan Amount"),
    yaxis = list(title = "Loan Interest Rate")
  )
# Box plots of the interest rate, one box per age value.
# Columns are referenced as `~column` so they are resolved in the `data`
# argument of add_trace() instead of the global `df` object.
plot_ly() %>%
  add_trace(
    data = df,
    y = ~rate_of_interest,
    color = ~age,
    colors = c("#f5ed04", "#de1b1d"),
    type = "box"
  ) %>%
  plotly::layout(
    xaxis = list(title = "age"),
    yaxis = list(title = "Loan Interest Rate")
  )
# Box plots of the interest rate, one box per status value.
# Columns are referenced as `~column` so they are resolved in the `data`
# argument of add_trace() instead of the global `df` object.
# NOTE(review): the x axis is titled "Historical Default" but the grouping
# column is `status`; confirm the title describes that column.
plot_ly() %>%
  add_trace(
    data = df,
    y = ~rate_of_interest,
    color = ~status,
    colors = "Dark2",
    type = "box"
  ) %>%
  plotly::layout(
    xaxis = list(title = "Historical Default"),
    yaxis = list(title = "Loan Interest Rate")
  )

Outliers

# Overview boxplots, one box per numeric column. Non-numeric columns are left
# out because a box-and-whisker summary is not meaningful for them.
boxplot(df[vapply(df, is.numeric, logical(1))])

# One horizontal boxplot per column of interest. boxplot() also returns its
# summary statistics, which are kept in p1 ... p10.
p1 <- boxplot(df[["loan_amount"]], main = "loan_amount", horizontal = TRUE)

p2 <- boxplot(df[["rate_of_interest"]], main = "rate_of_interest", horizontal = TRUE)

p3 <- boxplot(df[["interest_rate_spread"]], main = "interest_rate_spread", horizontal = TRUE)

p4 <- boxplot(df[["upfront_charges"]], main = "upfront_charges ", horizontal = TRUE)

p5 <- boxplot(df[["term"]], main = "term", horizontal = TRUE)

p6 <- boxplot(df[["property_value"]], main = "property_value ", horizontal = TRUE)

p7 <- boxplot(df[["income"]], main = "income", horizontal = TRUE)

p8 <- boxplot(df[["credit_score"]], main = "credit_score ", horizontal = TRUE)

p9 <- boxplot(df[["ltv"]], main = "ltv ", horizontal = TRUE)

p10 <- boxplot(df[["dtir1"]], main = "dtir1 ", horizontal = TRUE)

# Take a working copy named `data`; the outlier loop filters this copy, not df.
data <- df
dim(data)
## [1] 143983     34
# IQR-based outlier screening (Tukey fences) over the first 32 columns of
# `data`, one numeric column at a time. After each column the surviving rows
# replace `data`, so later columns compute their quartiles on the rows that
# are still left (same sequential behaviour as before).
#
# Fixes relative to the previous loop:
#   * a column whose IQR is 0 (e.g. a constant column) is skipped; its fences
#     collapse to a single value and the strict comparison used before removed
#     every row, leaving an empty result;
#   * values lying exactly on a fence are kept, they are not outliers;
#   * missing values no longer abort quantile(); rows with a missing value in
#     the screened column are kept, since missing is not evidence of an outlier;
#   * the local IQR value no longer shadows stats::IQR().
screened_cols <- seq_len(min(32L, ncol(data)))
for (i in screened_cols) {
  values <- data[[i]]
  if (!is.numeric(values)) {
    next
  }

  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr_width <- quartiles[2] - quartiles[1]
  # NA happens when the column has no observed values left; 0 means the
  # fences are degenerate. Neither case can identify outliers.
  if (is.na(iqr_width) || iqr_width == 0) {
    next
  }

  lower <- quartiles[1] - 1.5 * iqr_width
  upper <- quartiles[2] + 1.5 * iqr_width

  keep <- is.na(values) | (values >= lower & values <= upper)
  data <- data[keep, , drop = FALSE]
}
data_no_outlier <- data


dim(data_no_outlier)
## [1]  0 34
library(rsample)
## 
## Attaching package: 'rsample'
## The following object is masked from 'package:Rcpp':
## 
##     populate
library(dplyr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
set.seed(222)
# Drop the `id` and `year` columns (columns 1 and 2 of the loaded data) by
# name. Dropping by position removed two more columns every time this chunk
# was re-run; dropping by name is safe to repeat.
df <- df[, !(names(df) %in% c("id", "year")), drop = FALSE]
# Splitting the data into training (75%) and testing (25%) sets,
# stratified on `status`
df_split <- initial_split(df, prop = 0.75, strata = status)
train_set <- training(df_split)
test_set <- testing(df_split)


# Row counts of the two partitions, rendered as a bordered table that is only
# as wide as its content.
table_split <- data.frame(
  Dataset = c("Training Set", "Testing Set"),
  Count = c(nrow(train_set), nrow(test_set))
)

kable_styling(
  kable(table_split),
  bootstrap_options = "bordered",
  full_width = FALSE
)
Dataset Count
Training Set 107986
Testing Set 35997

Logistic Regression

# Binomial GLM (logistic regression) of `status` on every other column of the
# training set.
# NOTE(review): the recorded run reports non-convergence, fitted probabilities
# of 0 or 1 and coefficients not defined because of singularities; the
# estimates should be read with that in mind.
logistics_classifier <- glm(
  status ~ .,
  data = train_set,
  family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistics_classifier)
## 
## Call:
## glm(formula = status ~ ., family = binomial, data = train_set)
## 
## Coefficients: (2 not defined because of singularities)
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       3.056e+12  4.433e+12   0.689 0.490633    
## loan_limitncf                     7.173e-01  3.561e-02  20.145  < 2e-16 ***
## genderJoint                      -9.000e-03  3.922e-02  -0.229 0.818486    
## genderMale                        9.968e-02  2.783e-02   3.582 0.000341 ***
## genderSex Not Available           1.146e-02  3.553e-02   0.323 0.746975    
## approv_in_advpre                 -2.380e-01  2.765e-02  -8.607  < 2e-16 ***
## loan_typetype2                    7.704e-01  3.184e-02  24.193  < 2e-16 ***
## loan_typetype3                   -7.120e-01  3.839e-02 -18.544  < 2e-16 ***
## loan_purposep2                    9.600e-01  6.400e-02  14.999  < 2e-16 ***
## loan_purposep3                    3.250e-01  2.955e-02  10.997  < 2e-16 ***
## loan_purposep4                    1.303e-01  2.871e-02   4.540 5.63e-06 ***
## credit_worthinessl2               4.168e-01  4.412e-02   9.448  < 2e-16 ***
## open_creditopc                   -2.800e-01  1.870e-01  -1.497 0.134350    
## business_or_commercialnob/c              NA         NA      NA       NA    
## loan_amount                       7.739e-08  1.114e-07   0.695 0.487238    
## rate_of_interest                  7.059e-02  2.952e-02   2.392 0.016778 *  
## interest_rate_spread             -9.329e-01  3.471e-02 -26.876  < 2e-16 ***
## upfront_charges                  -5.011e-05  3.893e-06 -12.872  < 2e-16 ***
## term                             -1.388e-03  1.917e-04  -7.242 4.43e-13 ***
## neg_ammortizationnot_neg         -9.234e-01  2.710e-02 -34.070  < 2e-16 ***
## interest_onlynot_int             -3.670e-01  4.342e-02  -8.453  < 2e-16 ***
## lump_sum_paymentnot_lpsm         -2.599e+00  5.651e-02 -45.985  < 2e-16 ***
## property_value                    4.627e-07  5.452e-08   8.487  < 2e-16 ***
## construction_typesb              -3.056e+12  4.433e+12  -0.689 0.490633    
## occupancy_typepr                 -1.041e+00  4.491e-02 -23.174  < 2e-16 ***
## occupancy_typesr                 -4.933e-01  7.607e-02  -6.484 8.90e-11 ***
## secured_byland                   -3.056e+12  4.433e+12  -0.689 0.490633    
## total_units2U                     8.218e-01  8.169e-02  10.060  < 2e-16 ***
## total_units3U                     1.120e+00  1.535e-01   7.298 2.93e-13 ***
## total_units4U                     4.294e-01  1.889e-01   2.273 0.023003 *  
## income                           -5.234e-05  3.171e-06 -16.508  < 2e-16 ***
## credit_typeCRIF                   4.764e-02  2.249e-02   2.118 0.034140 *  
## credit_typeEQUI                   1.256e+01  1.289e+00   9.744  < 2e-16 ***
## credit_typeEXP                   -2.498e-02  2.310e-02  -1.082 0.279429    
## credit_score                      1.305e-04  8.056e-05   1.620 0.105267    
## co.applicant_credit_typeEXP      -2.869e-01  2.844e-02 -10.087  < 2e-16 ***
## age>74                           -5.844e-02  1.029e-01  -0.568 0.570160    
## age25-34                         -4.057e-01  9.691e-02  -4.186 2.84e-05 ***
## age35-44                         -3.892e-01  9.588e-02  -4.059 4.92e-05 ***
## age45-54                         -2.423e-01  9.590e-02  -2.527 0.011506 *  
## age55-64                         -1.450e-01  9.610e-02  -1.509 0.131218    
## age65-74                         -1.851e-01  9.753e-02  -1.898 0.057679 .  
## submission_of_applicationto_inst  1.027e+00  2.786e-02  36.854  < 2e-16 ***
## ltv                               1.932e-02  8.445e-04  22.882  < 2e-16 ***
## regionNorth                      -2.756e-01  3.897e-02  -7.072 1.53e-12 ***
## regionNorth-East                  3.359e-02  9.768e-02   0.344 0.730938    
## regionsouth                      -1.099e-01  4.296e-02  -2.559 0.010508 *  
## security_typeIndriect                    NA         NA      NA       NA    
## dtir1                             8.836e-03  1.012e-03   8.730  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 120238  on 107985  degrees of freedom
## Residual deviance:  74703  on 107939  degrees of freedom
## AIC: 74797
## 
## Number of Fisher Scoring iterations: 25

Prediction using the Logistic Regression Model

# Fitted probabilities (response scale) for the rows of the test set.
prob_pred <- predict(logistics_classifier, newdata = test_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Hard 0/1 labels using a 0.5 probability cut-off.
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

Confusion Matrix

# Confusion matrix of actual vs. predicted status at a 0.5 probability
# cut-off. Both sides are factors with the same fixed levels (0, 1), so the
# table is always 2 x 2 with matching row and column labels. Without fixed
# levels, a model that predicts only one class yields a one-column table, and
# diag() on it no longer picks the correctly classified cells.
outcome_levels <- c(0, 1)
cm <- table(
  ActualValue = factor(test_set$status, levels = outcome_levels),
  PredictedValue = factor(as.integer(prob_pred > 0.5), levels = outcome_levels)
)
cm
##            PredictedValue
## ActualValue FALSE  TRUE
##           0 26919   260
##           1  4520  4298

# Accuracy: correctly classified cases (the diagonal of cm) over all cases

sum(diag(cm)) / sum(cm)
## [1] 0.8672112

Decision Tree

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
## 
##     where
# Conditional inference tree of `status` on every other column of the
# training set; the fitted tree is printed right after fitting.
Tree_Classifer <- ctree(status ~ ., data = train_set)
Tree_Classifer
## 
##   Conditional inference tree with 79 terminal nodes
## 
## Response:  status 
## Inputs:  loan_limit, gender, approv_in_adv, loan_type, loan_purpose, credit_worthiness, open_credit, business_or_commercial, loan_amount, rate_of_interest, interest_rate_spread, upfront_charges, term, neg_ammortization, interest_only, lump_sum_payment, property_value, construction_type, occupancy_type, secured_by, total_units, income, credit_type, credit_score, co.applicant_credit_type, age, submission_of_application, ltv, region, security_type, dtir1 
## Number of observations:  107986 
## 
## 1) credit_type == {EQUI}; criterion = 1, statistic = 37684.315
##   2) rate_of_interest <= 4.088513; criterion = 1, statistic = 5491
##     3)*  weights = 10873 
##   2) rate_of_interest > 4.088513
##     4) dtir1 <= 37.65324; criterion = 1, statistic = 110
##       5)*  weights = 7 
##     4) dtir1 > 37.65324
##       6)*  weights = 104 
## 1) credit_type == {CIB, CRIF, EXP}
##   7) lump_sum_payment == {lpsm}; criterion = 1, statistic = 3177.545
##     8) submission_of_application == {not_inst}; criterion = 1, statistic = 57.953
##       9) upfront_charges <= 2762.44; criterion = 1, statistic = 72.462
##         10)*  weights = 188 
##       9) upfront_charges > 2762.44
##         11) upfront_charges <= 3550.096; criterion = 1, statistic = 112.678
##           12) rate_of_interest <= 3.75; criterion = 1, statistic = 80.85
##             13)*  weights = 7 
##           12) rate_of_interest > 3.75
##             14) rate_of_interest <= 4.085726; criterion = 1, statistic = 65.9
##               15) interest_rate_spread <= 0.4149882; criterion = 1, statistic = 60.867
##                 16)*  weights = 7 
##               15) interest_rate_spread > 0.4149882
##                 17)*  weights = 258 
##             14) rate_of_interest > 4.085726
##               18)*  weights = 7 
##         11) upfront_charges > 3550.096
##           19)*  weights = 40 
##     8) submission_of_application == {to_inst}
##       20) approv_in_adv == {pre}; criterion = 1, statistic = 59.865
##         21)*  weights = 23 
##       20) approv_in_adv == {nopre}
##         22) upfront_charges <= 3521.373; criterion = 1, statistic = 37.086
##           23) upfront_charges <= 2946.86; criterion = 1, statistic = 534.417
##             24)*  weights = 126 
##           23) upfront_charges > 2946.86
##             25) interest_rate_spread <= 0.47906; criterion = 1, statistic = 188.487
##               26) interest_rate_spread <= 0.400631; criterion = 1, statistic = 353.412
##                 27)*  weights = 10 
##               26) interest_rate_spread > 0.400631
##                 28) rate_of_interest <= 4.086297; criterion = 1, statistic = 364.512
##                   29)*  weights = 824 
##                 28) rate_of_interest > 4.086297
##                   30)*  weights = 10 
##             25) interest_rate_spread > 0.47906
##               31)*  weights = 23 
##         22) upfront_charges > 3521.373
##           32)*  weights = 146 
##   7) lump_sum_payment == {not_lpsm}
##     33) neg_ammortization == {neg_amm}; criterion = 1, statistic = 1950.56
##       34) credit_worthiness == {l2}; criterion = 1, statistic = 1061.948
##         35) rate_of_interest <= 4.541252; criterion = 1, statistic = 496.984
##           36)*  weights = 492 
##         35) rate_of_interest > 4.541252
##           37)*  weights = 26 
##       34) credit_worthiness == {l1}
##         38) rate_of_interest <= 4.106765; criterion = 1, statistic = 762.709
##           39) rate_of_interest <= 4; criterion = 1, statistic = 1874.547
##             40) occupancy_type == {ir}; criterion = 1, statistic = 33.595
##               41)*  weights = 28 
##             40) occupancy_type == {pr, sr}
##               42) rate_of_interest <= 3.99; criterion = 0.993, statistic = 13.716
##                 43) age == {<25, 25-34, 45-54}; criterion = 1, statistic = 39.722
##                   44) age == {<25}; criterion = 0.951, statistic = 12.852
##                     45)*  weights = 12 
##                   44) age == {25-34, 45-54}
##                     46)*  weights = 684 
##                 43) age == {>74, 35-44, 55-64, 65-74}
##                   47)*  weights = 1113 
##               42) rate_of_interest > 3.99
##                 48) rate_of_interest <= 3.998689; criterion = 1, statistic = 52.107
##                   49)*  weights = 9 
##                 48) rate_of_interest > 3.998689
##                   50)*  weights = 62 
##           39) rate_of_interest > 4
##             51) interest_rate_spread <= 0.4111201; criterion = 1, statistic = 887.192
##               52) interest_rate_spread <= 0.3693723; criterion = 1, statistic = 21.589
##                 53)*  weights = 7 
##               52) interest_rate_spread > 0.3693723
##                 54)*  weights = 17 
##             51) interest_rate_spread > 0.4111201
##               55)*  weights = 2270 
##         38) rate_of_interest > 4.106765
##           56) rate_of_interest <= 4.241421; criterion = 1, statistic = 19.05
##             57) rate_of_interest <= 4.18; criterion = 1, statistic = 94.922
##               58) loan_purpose == {p2, p4}; criterion = 1, statistic = 49.019
##                 59)*  weights = 74 
##               58) loan_purpose == {p1, p3}
##                 60)*  weights = 205 
##             57) rate_of_interest > 4.18
##               61)*  weights = 8 
##           56) rate_of_interest > 4.241421
##             62) business_or_commercial == {b/c}; criterion = 0.998, statistic = 15.896
##               63) interest_rate_spread <= 0.4712; criterion = 1, statistic = 26.003
##                 64)*  weights = 7 
##               63) interest_rate_spread > 0.4712
##                 65)*  weights = 468 
##             62) business_or_commercial == {nob/c}
##               66)*  weights = 3659 
##     33) neg_ammortization == {not_neg}
##       67) submission_of_application == {to_inst}; criterion = 1, statistic = 1177.585
##         68) loan_type == {type2}; criterion = 1, statistic = 684.472
##           69) interest_rate_spread <= 0.4789281; criterion = 1, statistic = 3261.762
##             70) rate_of_interest <= 3.99; criterion = 1, statistic = 2150.126
##               71) rate_of_interest <= 3.875; criterion = 1, statistic = 71.159
##                 72)*  weights = 446 
##               71) rate_of_interest > 3.875
##                 73) rate_of_interest <= 3.989792; criterion = 0.994, statistic = 13.857
##                   74)*  weights = 15 
##                 73) rate_of_interest > 3.989792
##                   75)*  weights = 15 
##             70) rate_of_interest > 3.99
##               76) upfront_charges <= 2695.539; criterion = 1, statistic = 1480.899
##                 77) interest_rate_spread <= 0.3485905; criterion = 0.998, statistic = 16.438
##                   78)*  weights = 12 
##                 77) interest_rate_spread > 0.3485905
##                   79)*  weights = 12 
##               76) upfront_charges > 2695.539
##                 80)*  weights = 2201 
##           69) interest_rate_spread > 0.4789281
##             81)*  weights = 5834 
##         68) loan_type == {type1, type3}
##           82) upfront_charges <= 3484.874; criterion = 1, statistic = 841.207
##             83) upfront_charges <= 3000.37; criterion = 1, statistic = 6023.67
##               84) upfront_charges <= 2944; criterion = 1, statistic = 186.606
##                 85) upfront_charges <= 1182.32; criterion = 0.996, statistic = 14.568
##                   86)*  weights = 5056 
##                 85) upfront_charges > 1182.32
##                   87) loan_amount <= 466500; criterion = 1, statistic = 21.828
##                     88) income <= 18840; criterion = 0.973, statistic = 12.336
##                       89)*  weights = 7459 
##                     88) income > 18840
##                       90)*  weights = 77 
##                   87) loan_amount > 466500
##                     91)*  weights = 414 
##               84) upfront_charges > 2944
##                 92) ltv <= 83.29918; criterion = 1, statistic = 34.69
##                   93) loan_amount <= 236500; criterion = 1, statistic = 23.359
##                     94) interest_rate_spread <= 0.4481444; criterion = 0.989, statistic = 12.778
##                       95) interest_rate_spread <= 0.4268; criterion = 0.992, statistic = 13.303
##                         96)*  weights = 59 
##                       95) interest_rate_spread > 0.4268
##                         97)*  weights = 25 
##                     94) interest_rate_spread > 0.4481444
##                       98)*  weights = 198 
##                   93) loan_amount > 236500
##                     99)*  weights = 114 
##                 92) ltv > 83.29918
##                   100)*  weights = 103 
##             83) upfront_charges > 3000.37
##               101) interest_rate_spread <= 0.4788959; criterion = 1, statistic = 956.05
##                 102) interest_rate_spread <= 0.4065; criterion = 1, statistic = 3512.229
##                   103) occupancy_type == {ir, sr}; criterion = 1, statistic = 108.952
##                     104)*  weights = 44 
##                   103) occupancy_type == {pr}
##                     105) interest_rate_spread <= 0.3909; criterion = 0.958, statistic = 148.516
##                       106)*  weights = 1426 
##                     105) interest_rate_spread > 0.3909
##                       107)*  weights = 69 
##                 102) interest_rate_spread > 0.4065
##                   108) rate_of_interest <= 3.99; criterion = 1, statistic = 682.975
##                     109) interest_rate_spread <= 0.4100713; criterion = 1, statistic = 32.033
##                       110)*  weights = 19 
##                     109) interest_rate_spread > 0.4100713
##                       111)*  weights = 144 
##                   108) rate_of_interest > 3.99
##                     112) rate_of_interest <= 4.096569; criterion = 1, statistic = 4946.938
##                       113)*  weights = 6507 
##                     112) rate_of_interest > 4.096569
##                       114)*  weights = 126 
##               101) interest_rate_spread > 0.4788959
##                 115)*  weights = 2930 
##           82) upfront_charges > 3484.874
##             116) upfront_charges <= 3548.442; criterion = 1, statistic = 173.011
##               117) loan_amount <= 236500; criterion = 0.992, statistic = 13.273
##                 118)*  weights = 338 
##               117) loan_amount > 236500
##                 119)*  weights = 148 
##             116) upfront_charges > 3548.442
##               120)*  weights = 20473 
##       67) submission_of_application == {not_inst}
##         121) interest_rate_spread <= 0.4089; criterion = 1, statistic = 609.037
##           122) business_or_commercial == {nob/c}; criterion = 1, statistic = 53.747
##             123) interest_rate_spread <= 0.4002; criterion = 1, statistic = 36.016
##               124) interest_rate_spread <= 0.3275; criterion = 1, statistic = 22.918
##                 125) loan_purpose == {p1, p4}; criterion = 0.999, statistic = 22.866
##                   126)*  weights = 15022 
##                 125) loan_purpose == {p2, p3}
##                   127) occupancy_type == {ir}; criterion = 0.996, statistic = 17.991
##                     128)*  weights = 61 
##                   127) occupancy_type == {pr, sr}
##                     129)*  weights = 3528 
##               124) interest_rate_spread > 0.3275
##                 130) dtir1 <= 50; criterion = 0.981, statistic = 11.712
##                   131)*  weights = 1575 
##                 130) dtir1 > 50
##                   132)*  weights = 17 
##             123) interest_rate_spread > 0.4002
##               133)*  weights = 206 
##           122) business_or_commercial == {b/c}
##             134) rate_of_interest <= 3.99; criterion = 1, statistic = 63.091
##               135) rate_of_interest <= 3.875; criterion = 0.994, statistic = 13.843
##                 136)*  weights = 548 
##               135) rate_of_interest > 3.875
##                 137)*  weights = 11 
##             134) rate_of_interest > 3.99
##               138)*  weights = 7 
##         121) interest_rate_spread > 0.4089
##           139) interest_rate_spread <= 0.4696689; criterion = 1, statistic = 2284.793
##             140) upfront_charges <= 2925.97; criterion = 1, statistic = 465.181
##               141)*  weights = 941 
##             140) upfront_charges > 2925.97
##               142) upfront_charges <= 3546.892; criterion = 1, statistic = 1264.361
##                 143) rate_of_interest <= 3.99; criterion = 0.997, statistic = 15.249
##                   144) interest_rate_spread <= 0.4138774; criterion = 0.998, statistic = 16.29
##                     145)*  weights = 9 
##                   144) interest_rate_spread > 0.4138774
##                     146)*  weights = 23 
##                 143) rate_of_interest > 3.99
##                   147) rate_of_interest <= 4.090069; criterion = 1, statistic = 1206.023
##                     148) rate_of_interest <= 4.004944; criterion = 0.991, statistic = 13.22
##                       149)*  weights = 27 
##                     148) rate_of_interest > 4.004944
##                       150)*  weights = 2520 
##                   147) rate_of_interest > 4.090069
##                     151) interest_rate_spread <= 0.4514; criterion = 0.996, statistic = 14.883
##                       152)*  weights = 14 
##                     151) interest_rate_spread > 0.4514
##                       153)*  weights = 13 
##               142) upfront_charges > 3546.892
##                 154)*  weights = 246 
##           139) interest_rate_spread > 0.4696689
##             155) interest_rate_spread <= 0.4789182; criterion = 1, statistic = 39.409
##               156)*  weights = 183 
##             155) interest_rate_spread > 0.4789182
##               157)*  weights = 6977
# Draw the fitted tree.
plot(Tree_Classifer)

# Score the test set with the decision tree and cross-tabulate the actual
# status against the predicted one.

pred <- predict(Tree_Classifer, newdata = test_set)
cm <- table(ActualValue = test_set[["status"]], PredictedValue = pred)
cm
##            PredictedValue
## ActualValue     0     1
##           0 27167    12
##           1    94  8724

Estimating the percentage of performance (accuracy)

# Share of test rows on the diagonal of cm, i.e. classified correctly.
sum(diag(cm)) / sum(cm)
## [1] 0.9970553

Random Forest

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fix the RNG seed before fitting so the forest is reproducible, then fit a
# random forest of `status` on every other column of the training set.
set.seed(222)
rf_classifier <- randomForest(
  status ~ .,
  data = train_set
)

# Inspect the structure of the fitted object.
str(rf_classifier)
## List of 19
##  $ call           : language randomForest(formula = status ~ ., data = train_set)
##  $ type           : chr "classification"
##  $ predicted      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:107986] "3" "4" "5" "6" ...
##  $ err.rate       : num [1:500, 1:3] 0.00737 0.00705 0.00584 0.00528 0.0048 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:3] "OOB" "0" "1"
##  $ confusion      : num [1:2, 1:3] 81534 99 0 26353 0 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "0" "1"
##   .. ..$ : chr [1:3] "0" "1" "class.error"
##  $ votes          : 'matrix' num [1:107986, 1:2] 1 1 1 1 1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:107986] "3" "4" "5" "6" ...
##   .. ..$ : chr [1:2] "0" "1"
##  $ oob.times      : num [1:107986] 193 185 187 198 206 180 181 187 178 178 ...
##  $ classes        : chr [1:2] "0" "1"
##  $ importance     : num [1:31, 1] 19.6 38.9 18.3 214.1 94.9 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
##   .. ..$ : chr "MeanDecreaseGini"
##  $ importanceSD   : NULL
##  $ localImportance: NULL
##  $ proximity      : NULL
##  $ ntree          : num 500
##  $ mtry           : num 5
##  $ forest         :List of 14
##   ..$ ndbigtree : int [1:500] 1133 1169 799 825 1399 1369 1307 841 1045 691 ...
##   ..$ nodestatus: int [1:2761, 1:500] 1 1 1 1 1 1 1 -1 1 1 ...
##   ..$ bestvar   : int [1:2761, 1:500] 11 22 11 12 11 23 11 0 29 23 ...
##   ..$ treemap   : int [1:2761, 1:2, 1:500] 2 4 6 8 10 12 14 0 16 18 ...
##   ..$ nodepred  : int [1:2761, 1:500] 0 0 0 0 0 0 0 2 0 0 ...
##   ..$ xbestsplit: num [1:2761, 1:500] 0.408 330 0.47 3218.194 0.406 ...
##   ..$ pid       : num [1:2] 1 1
##   ..$ cutoff    : num [1:2] 0.5 0.5
##   ..$ ncat      : Named int [1:31] 2 4 2 3 4 2 2 2 1 1 ...
##   .. ..- attr(*, "names")= chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
##   ..$ maxcat    : int 7
##   ..$ nrnodes   : int 2761
##   ..$ ntree     : num 500
##   ..$ nclass    : int 2
##   ..$ xlevels   :List of 31
##   .. ..$ loan_limit               : chr [1:2] "cf" "ncf"
##   .. ..$ gender                   : chr [1:4] "Female" "Joint" "Male" "Sex Not Available"
##   .. ..$ approv_in_adv            : chr [1:2] "nopre" "pre"
##   .. ..$ loan_type                : chr [1:3] "type1" "type2" "type3"
##   .. ..$ loan_purpose             : chr [1:4] "p1" "p2" "p3" "p4"
##   .. ..$ credit_worthiness        : chr [1:2] "l1" "l2"
##   .. ..$ open_credit              : chr [1:2] "nopc" "opc"
##   .. ..$ business_or_commercial   : chr [1:2] "b/c" "nob/c"
##   .. ..$ loan_amount              : num 0
##   .. ..$ rate_of_interest         : num 0
##   .. ..$ interest_rate_spread     : num 0
##   .. ..$ upfront_charges          : num 0
##   .. ..$ term                     : num 0
##   .. ..$ neg_ammortization        : chr [1:2] "neg_amm" "not_neg"
##   .. ..$ interest_only            : chr [1:2] "int_only" "not_int"
##   .. ..$ lump_sum_payment         : chr [1:2] "lpsm" "not_lpsm"
##   .. ..$ property_value           : num 0
##   .. ..$ construction_type        : chr [1:2] "mh" "sb"
##   .. ..$ occupancy_type           : chr [1:3] "ir" "pr" "sr"
##   .. ..$ secured_by               : chr [1:2] "home" "land"
##   .. ..$ total_units              : chr [1:4] "1U" "2U" "3U" "4U"
##   .. ..$ income                   : num 0
##   .. ..$ credit_type              : chr [1:4] "CIB" "CRIF" "EQUI" "EXP"
##   .. ..$ credit_score             : num 0
##   .. ..$ co.applicant_credit_type : chr [1:2] "CIB" "EXP"
##   .. ..$ age                      : chr [1:7] "<25" ">74" "25-34" "35-44" ...
##   .. ..$ submission_of_application: chr [1:2] "not_inst" "to_inst"
##   .. ..$ ltv                      : num 0
##   .. ..$ region                   : chr [1:4] "central" "North" "North-East" "south"
##   .. ..$ security_type            : chr [1:2] "direct" "Indriect"
##   .. ..$ dtir1                    : num 0
##  $ y              : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:107986] "3" "4" "5" "6" ...
##  $ test           : NULL
##  $ inbag          : NULL
##  $ terms          :Classes 'terms', 'formula'  language status ~ loan_limit + gender + approv_in_adv + loan_type + loan_purpose +      credit_worthiness + open_credit + | __truncated__ ...
##   .. ..- attr(*, "variables")= language list(status, loan_limit, gender, approv_in_adv, loan_type, loan_purpose,      credit_worthiness, open_credit, bus| __truncated__ ...
##   .. ..- attr(*, "factors")= int [1:32, 1:31] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:32] "status" "loan_limit" "gender" "approv_in_adv" ...
##   .. .. .. ..$ : chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
##   .. ..- attr(*, "term.labels")= chr [1:31] "loan_limit" "gender" "approv_in_adv" "loan_type" ...
##   .. ..- attr(*, "order")= int [1:31] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= num 0
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(status, loan_limit, gender, approv_in_adv, loan_type, loan_purpose,      credit_worthiness, open_credit, bus| __truncated__ ...
##   .. ..- attr(*, "dataClasses")= Named chr [1:32] "factor" "factor" "factor" "factor" ...
##   .. .. ..- attr(*, "names")= chr [1:32] "status" "loan_limit" "gender" "approv_in_adv" ...
##  - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"
# List the attributes of the fitted random forest object: the names of its
# components and its class vector.
attributes(rf_classifier)
## $names
##  [1] "call"            "type"            "predicted"       "err.rate"       
##  [5] "confusion"       "votes"           "oob.times"       "classes"        
##  [9] "importance"      "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"          "y"              
## [17] "test"            "inbag"           "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"

Confusion Matrix

# Predict a class label for every row of test_set using the fitted forest.
# `newdata` is named explicitly so the call does not rely on argument order.
rf_pred <- predict(rf_classifier, newdata = test_set)

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Cross-tabulate the predicted labels against the observed status in
# test_set and print caret's summary statistics.
# NOTE(review): the printed result reports 'Positive' Class : 0, so
# Sensitivity/Specificity describe class "0"; pass positive = "1" if class 1
# is the event of interest -- confirm.
confusionMatrix(
  data = rf_pred,
  reference = test_set$status
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 27179    32
##          1     0  8786
##                                           
##                Accuracy : 0.9991          
##                  95% CI : (0.9987, 0.9994)
##     No Information Rate : 0.755           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9976          
##                                           
##  Mcnemar's Test P-Value : 4.251e-08       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9964          
##          Pos Pred Value : 0.9988          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.7550          
##          Detection Rate : 0.7550          
##    Detection Prevalence : 0.7559          
##       Balanced Accuracy : 0.9982          
##                                           
##        'Positive' Class : 0               
## 
# Plot the fitted forest; presumably this draws the error-rate curves
# (OOB and per class, cf. the err.rate component) against the number of
# trees -- confirm against the rendered figure.
plot(rf_classifier)

Determining the most important variables in the random forest

# Draw the variable-importance plot for the fitted forest.
varImpPlot(rf_classifier)

# Print the MeanDecreaseGini importance of every predictor in the forest.
# NOTE(review): rate_of_interest, interest_rate_spread and upfront_charges
# dominate the ranking while test accuracy is ~0.999; those columns show NA
# in the first rows of glimpse(df), so check for target leakage through
# missingness before trusting these results.
importance(rf_classifier)
##                           MeanDecreaseGini
## loan_limit                    1.960654e+01
## gender                        3.890623e+01
## approv_in_adv                 1.834797e+01
## loan_type                     2.141204e+02
## loan_purpose                  9.492187e+01
## credit_worthiness             2.690797e+01
## open_credit                   1.084549e+01
## business_or_commercial        1.325256e+02
## loan_amount                   1.752984e+02
## rate_of_interest              1.306849e+04
## interest_rate_spread          1.001180e+04
## upfront_charges               7.267285e+03
## term                          8.797034e+01
## neg_ammortization             1.224602e+02
## interest_only                 8.573198e+00
## lump_sum_payment              3.127223e+02
## property_value                7.396553e+02
## construction_type             1.740026e-01
## occupancy_type                1.863543e+01
## secured_by                    1.349490e-01
## total_units                   4.530986e+00
## income                        2.824626e+02
## credit_type                   5.423771e+03
## credit_score                  5.258760e+01
## co.applicant_credit_type      1.425788e+02
## age                           4.127812e+01
## submission_of_application     1.062712e+02
## ltv                           7.201919e+02
## region                        1.598862e+01
## security_type                 1.851919e-01
## dtir1                         7.657246e+02